In [ ]:
import re
import sys
import warnings
import pandas as pd
from pandasql import sqldf
# Ignore all warnings
warnings.filterwarnings("ignore")
import os
import pyodbc
import pandas as pd
import pandera as pa
import csv
#from bs4 import BeautifulSoup
#from pandera import DataFrameSchema,Column, Check, Index, MultiIndex
#import sqlalchemy;
from os import environ;
pd.set_option('display.max_columns', None)
dirpath = "./"

# Check if directory exists
if not os.path.exists(dirpath):
    raise FileNotFoundError(f"The directory {dirpath} does not exist.")
else:
    print(f"The directory {dirpath} exists.")
    # List all files in the directory
files_in_dir = os.listdir(dirpath)
print("Files in directory:", files_in_dir)
all_filename = 'Counties_Percapita_Formatted.csv'
singlefamily_filename = 'Counties_Percapita_FormattedZip_zhvi_uc_sfr_tier_0.33_0.67_sm_sa_month.csv.csv'
ThreeBedRoom_filename = 'Counties_Percapita_FormattedZip_zhvi_uc_ThreeBedRoom_tier_0.33_0.67_sm_sa_month.csv.csv'
threebed_filename = 'Counties_Percapita_FormattedZip_zhvi_bdrmcnt_3_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv.csv'
twobed_filename = 'Counties_Percapita_FormattedZip_zhvi_bdrmcnt_2_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv.csv'
condo_filename = 'Counties_Percapita_FormattedZip_zhvi_uc_condo_tier_0.33_0.67_sm_sa_month.csv.csv'
all_filepath = os.path.join(dirpath, all_filename)
allhomes_df = pd.read_csv(all_filepath, header=0)  # Read without headers

sf_filepath = os.path.join(dirpath, singlefamily_filename)
sf_df = pd.read_csv(sf_filepath, header=0)  # Read without headers

condo_filepath = os.path.join(dirpath, condo_filename)
condo_df = pd.read_csv(condo_filepath, header=0)  # Read without headers
threebed_filepath = os.path.join(dirpath, threebed_filename)
threebed_df = pd.read_csv(threebed_filepath, header=0)  # Read without headers
twobed_filepath = os.path.join(dirpath, twobed_filename)
twobed_df = pd.read_csv(twobed_filepath, header=0)  # Read without headers
The directory C:\Users\req92163\OneDrive - Mutual of Omaha\DSCI8950\DS_2025_TeamE\data_files exists.
Files in directory: ['bkp_files', 'cfips_location.csv', 'Counties_Percapita_Formatted.csv', 'Counties_Percapita_FormattedZipcode_zhvi_sfcondo_tier.csv.csv', 'Counties_Percapita_FormattedZip_zhvi_bdrmcnt_2_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv.csv', 'Counties_Percapita_FormattedZip_zhvi_bdrmcnt_3_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv.csv', 'Counties_Percapita_FormattedZip_zhvi_uc_condo_tier_0.33_0.67_sm_sa_month.csv.csv', 'Counties_Percapita_FormattedZip_zhvi_uc_sfr_tier_0.33_0.67_sm_sa_month.csv.csv', 'percapital_Fmatted_IncomeZipcode_zhvi_sfcondo_tier.csv.csv', 'percapital_Fmatted_IncomeZip_zhvi_bdrmcnt_2_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv.csv', 'percapital_Fmatted_IncomeZip_zhvi_bdrmcnt_3_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv.csv', 'percapital_Fmatted_IncomeZip_zhvi_uc_condo_tier_0.33_0.67_sm_sa_month.csv.csv', 'percapital_Fmatted_IncomeZip_zhvi_uc_sfr_tier_0.33_0.67_sm_sa_month.csv.csv', 'percapital_Formatted_Income.csv', 'Percapita_Income.csv', 'test.txt', 'zillow_combined_zhvi.csv']
In [4]:
twobed_df.rename(columns={    
    'HouseValue': 'AnnualValue_TwoBedRoom',
    'prev_HouseValue': 'Previous_AnnualValue_TwoBedRoom',
    'annual_increase': 'Annual_Increase_TwoBedRoom',
    'BasePrice': '5Year_BasePrice_TwoBedRoom',
    'increase_base_pct': '5Year_Percent_Increase_TwoBedRoom',
    'Rank_Annual': 'Annual_Rank_TwoBedRoom',
    'annual_value_category': 'Annual_Value_Category_TwoBedRoom'
}, inplace=True)
#twobed_df.columns
threebed_df.rename(columns={    
    'HouseValue': 'AnnualValue_ThreeBedRoom',
    'prev_HouseValue': 'Previous_AnnualValue_ThreeBedRoom',
    'annual_increase': 'Annual_Increase_ThreeBedRoom',
    'BasePrice': '5Year_BasePrice_ThreeBedRoom',
    'increase_base_pct': '5Year_Percent_Increase_ThreeBedRoom',
    'Rank_Annual': 'Annual_Rank_ThreeBedRoom',
    'annual_value_category': 'Annual_Value_Category_ThreeBedRoom'
}, inplace=True)
#threebed_df.columns
allhomes_df.rename(columns={    
    'HouseValue': 'AnnualValue_AllHomes',
    'prev_HouseValue': 'Previous_AnnualValue_AllHomes',
    'annual_increase': 'Annual_Increase_AllHomes',
    'BasePrice': '5Year_BasePrice_AllHomes',
    'increase_base_pct': '5Year_Percent_Increase_AllHomes',
    'Rank_Annual': 'Annual_Rank_AllHomes',
    'annual_value_category': 'Annual_Value_Category_AllHomes'
}, inplace=True)
#allhomes_df.columns
sf_df.rename(columns={    
    'HouseValue': 'AnnualValue_SingleFamily',
    'prev_HouseValue': 'Previous_AnnualValue_SingleFamily',
    'annual_increase': 'Annual_Increase_SingleFamily',
    'BasePrice': '5Year_BasePrice_SingleFamily',
    'increase_base_pct': '5Year_Percent_Increase_SingleFamily',
    'Rank_Annual': 'Annual_Rank_SingleFamily',
    'annual_value_category': 'Annual_Value_Category_SingleFamily'
}, inplace=True)
#sf_df.columns
condo_df.rename(columns={    
    'HouseValue': 'AnnualValue_Condo',
    'prev_HouseValue': 'Previous_AnnualValue_Condo',
    'annual_increase': 'Annual_Increase_Condo',
    'BasePrice': '5Year_BasePrice_Condo',
    'increase_base_pct': '5Year_Percent_Increase_Condo',
    'Rank_Annual': 'Annual_Rank_Condo',
    'annual_value_category': 'Annual_Value_Category_Condo'
}, inplace=True)
condo_df.columns
Out[4]:
Index(['CountyName', 'StateName', 'FIPS', 'City', 'Metro', 'lat', 'lng',
       'Year', '5Year_BasePrice_Condo', 'AnnualValue_Condo',
       'Previous_AnnualValue_Condo', '5Year_Percent_Increase_Condo',
       'Annual_Increase_Condo', 'Annual_Rank_Condo',
       'Annual_Value_Category_Condo', 'Personal_Income_Raise',
       'Population_Raise', 'Percapita_Income_Raise'],
      dtype='object')
In [6]:
# Initialize merged_df with allhomes_df
merged_df = allhomes_df[['CountyName', 'StateName', 'FIPS', 'City', 'Metro', 'lat', 'lng', 'Year', 
                         'Personal_Income_Raise', 'Population_Raise', 'Percapita_Income_Raise', 
                         '5Year_BasePrice_AllHomes', 'AnnualValue_AllHomes', 'Previous_AnnualValue_AllHomes', 
                         '5Year_Percent_Increase_AllHomes', 'Annual_Increase_AllHomes', 'Annual_Rank_AllHomes', 
                         'Annual_Value_Category_AllHomes']].copy()
# Select only 'Personal_Income_Raise', 'Population_Raise', 'Percapita_Income_Raise' columns from allhomes_df
merged_df = merged_df[['CountyName', 'StateName', 'FIPS', 'City', 'Metro', 'lat', 'lng', 'Year', 
                       'Personal_Income_Raise', 'Population_Raise', 'Percapita_Income_Raise', 
                       '5Year_BasePrice_AllHomes', 'AnnualValue_AllHomes', 'Previous_AnnualValue_AllHomes', 
                       '5Year_Percent_Increase_AllHomes', 'Annual_Increase_AllHomes', 'Annual_Rank_AllHomes', 
                       'Annual_Value_Category_AllHomes']]
# Merge all dataframes on the specified keys
#keys = ['CountyName', 'StateName', 'FIPS', 'City', 'Metro', 'lat', 'lng', 'Year', 'Personal_Income_Raise', 'Population_Raise', 'Percapita_Income_Raise']
keys = ['CountyName', 'StateName',  'Year']
merged_df = merged_df.merge(sf_df, on=keys, how='outer', suffixes=('', '_SingleFamily'))
merged_df = merged_df.merge(condo_df, on=keys, how='inner', suffixes=('', '_Condo'))
merged_df = merged_df.merge(threebed_df, on=keys, how='outer', suffixes=('', '_ThreeBedRoom'))
merged_df = merged_df.merge(twobed_df, on=keys, how='outer', suffixes=('', '_TwoBedRoom'))

merged_df.rename(columns={
    'CountyName': 'County',
    'StateName': 'State',
    'FIPS': 'FIPS_Code',
    'City': 'City_Name',
    'Metro': 'Metropolitan_Area',
    'lat': 'Latitude',
    'lng': 'Longitude',
    'Year': 'Year_Recorded',
    'Personal_Income_Raise': 'Personal_Income_Growth',
    'Population_Raise': 'Population_Growth',
    'Percapita_Income_Raise': 'Per_Capita_Income_Growth'
},inplace=True)
# Display the first few rows of the merged dataframe
# Initialize merged_df with allhomes_df
merged_df = allhomes_df[['CountyName', 'StateName', 'FIPS', 'City', 'Metro', 'lat', 'lng', 'Year', 
                         'Personal_Income_Raise', 'Population_Raise', 'Percapita_Income_Raise', 
                         '5Year_BasePrice_AllHomes',  '5Year_Percent_Increase_AllHomes', 'AnnualValue_AllHomes', 'Previous_AnnualValue_AllHomes', 
                        'Annual_Increase_AllHomes', 'Annual_Rank_AllHomes', 
                         'Annual_Value_Category_AllHomes']].copy()

# Merge all dataframes on the specified keys
keys = ['CountyName', 'StateName',  'Year']
merged_df = merged_df.merge(sf_df[['CountyName', 'StateName',  'Year', 
                                   '5Year_BasePrice_SingleFamily','5Year_Percent_Increase_SingleFamily',  'AnnualValue_SingleFamily', 
                                   'Previous_AnnualValue_SingleFamily', 
                                   'Annual_Increase_SingleFamily', 'Annual_Rank_SingleFamily', 
                                   'Annual_Value_Category_SingleFamily']], on=keys, how='outer')

merged_df = merged_df.merge(condo_df[['CountyName', 'StateName',  'Year', 
                                      '5Year_BasePrice_Condo', '5Year_Percent_Increase_Condo', 'AnnualValue_Condo', 
                                      'Previous_AnnualValue_Condo', 
                                      'Annual_Increase_Condo', 'Annual_Rank_Condo', 
                                      'Annual_Value_Category_Condo']], on=keys, how='outer')

merged_df = merged_df.merge(threebed_df[['CountyName', 'StateName', 'Year', 
                                         '5Year_BasePrice_ThreeBedRoom','5Year_Percent_Increase_ThreeBedRoom',  'AnnualValue_ThreeBedRoom', 
                                         'Previous_AnnualValue_ThreeBedRoom', 
                                         'Annual_Increase_ThreeBedRoom', 'Annual_Rank_ThreeBedRoom', 
                                         'Annual_Value_Category_ThreeBedRoom']], on=keys, how='outer')

merged_df = merged_df.merge(twobed_df[['CountyName', 'StateName',  'Year', 
                                       '5Year_BasePrice_TwoBedRoom','5Year_Percent_Increase_TwoBedRoom',  'AnnualValue_TwoBedRoom', 
                                       'Previous_AnnualValue_TwoBedRoom', 
                                       'Annual_Increase_TwoBedRoom', 'Annual_Rank_TwoBedRoom', 
                                       'Annual_Value_Category_TwoBedRoom']], on=keys, how='outer')

merged_df.rename(columns={
    'CountyName': 'County',
    'StateName': 'State',
    'FIPS': 'FIPS_Code',
    'City': 'City_Name',
    'Metro': 'Metropolitan_Area',
    'lat': 'Latitude',
    'lng': 'Longitude',
    'Year': 'Year_Recorded',
    'Personal_Income_Raise': 'Personal_Income_Growth',
    'Population_Raise': 'Population_Growth',
    'Percapita_Income_Raise': 'Per_Capita_Income_Growth'
}, inplace=True)

# Reorder columns
#merged_df = merged_df[column_order]

# Display the first few rows of the merged dataframe
merged_df.head(15)
Out[6]:
County State FIPS_Code City_Name Metropolitan_Area Latitude Longitude Year_Recorded Personal_Income_Growth Population_Growth Per_Capita_Income_Growth 5Year_BasePrice_AllHomes 5Year_Percent_Increase_AllHomes AnnualValue_AllHomes Previous_AnnualValue_AllHomes Annual_Increase_AllHomes Annual_Rank_AllHomes Annual_Value_Category_AllHomes 5Year_BasePrice_SingleFamily 5Year_Percent_Increase_SingleFamily AnnualValue_SingleFamily Previous_AnnualValue_SingleFamily Annual_Increase_SingleFamily Annual_Rank_SingleFamily Annual_Value_Category_SingleFamily 5Year_BasePrice_Condo 5Year_Percent_Increase_Condo AnnualValue_Condo Previous_AnnualValue_Condo Annual_Increase_Condo Annual_Rank_Condo Annual_Value_Category_Condo 5Year_BasePrice_ThreeBedRoom 5Year_Percent_Increase_ThreeBedRoom AnnualValue_ThreeBedRoom Previous_AnnualValue_ThreeBedRoom Annual_Increase_ThreeBedRoom Annual_Rank_ThreeBedRoom Annual_Value_Category_ThreeBedRoom 5Year_BasePrice_TwoBedRoom 5Year_Percent_Increase_TwoBedRoom AnnualValue_TwoBedRoom Previous_AnnualValue_TwoBedRoom Annual_Increase_TwoBedRoom Annual_Rank_TwoBedRoom Annual_Value_Category_TwoBedRoom
0 ABBEVILLE SC NaN Donalds Greenwood, SC 34.222695 -82.458727 2020 NaN NaN NaN 117403.000000 18.0 138137.000000 117403.000000 18.0 103.0 Above 15 Percent 115696.000000 18.0 136407.000000 115696.000000 18.0 102.0 Above 15 Percent NaN NaN NaN NaN NaN NaN NaN 116718.000000 17.0 136521.000000 116718.000000 17.0 85.5 Above 15 Percent NaN NaN NaN NaN NaN NaN NaN
1 ABBEVILLE SC NaN Donalds Greenwood, SC 34.222695 -82.458727 2021 NaN NaN NaN 117403.000000 39.0 162819.000000 138137.000000 18.0 538.5 Above 15 Percent 115696.000000 39.0 160577.000000 136407.000000 18.0 540.5 Above 15 Percent NaN NaN NaN NaN NaN NaN NaN 116718.000000 35.0 157712.000000 136521.000000 16.0 851.5 Above 15 Percent NaN NaN NaN NaN NaN NaN NaN
2 ABBEVILLE SC 45001.0 Donalds Greenwood, SC 34.222695 -82.458727 2022 7.8 0.0 7.9 117403.000000 56.0 183338.000000 162819.000000 13.0 285.0 Above 12 Percent 115696.000000 57.0 181376.000000 160577.000000 13.0 287.5 Above 12 Percent NaN NaN NaN NaN NaN NaN NaN 116718.000000 60.0 186346.000000 157712.000000 18.0 72.0 Above 15 Percent NaN NaN NaN NaN NaN NaN NaN
3 ABBEVILLE SC 45001.0 Donalds Greenwood, SC 34.222695 -82.458727 2023 2.2 0.3 1.9 117403.000000 54.0 181153.000000 183338.000000 -1.0 1828.0 2 Percent Decrease 115696.000000 55.0 179578.000000 181376.000000 -1.0 1828.5 2 Percent Decrease NaN NaN NaN NaN NaN NaN NaN 116718.000000 67.0 195261.000000 186346.000000 5.0 701.0 Above 5 Percent NaN NaN NaN NaN NaN NaN NaN
4 ABBEVILLE SC 45001.0 Donalds Greenwood, SC 34.222695 -82.458727 2024 5.0 0.4 4.6 117403.000000 58.0 184911.000000 181153.000000 2.0 1709.0 Above 1 Percent 115696.000000 58.0 183180.000000 179578.000000 2.0 1718.0 Above 1 Percent NaN NaN NaN NaN NaN NaN NaN 116718.000000 74.0 203177.000000 195261.000000 4.0 1174.0 Above 3 Percent NaN NaN NaN NaN NaN NaN NaN
5 ADA ID NaN Star Boise City, ID 43.450841 -116.241035 2020 NaN NaN NaN 355784.733333 19.0 424153.000000 355784.733333 19.0 65.5 Above 15 Percent 357113.200000 19.0 426129.933333 357113.200000 19.0 68.0 Above 15 Percent 242491.25 15.0 279064.500 242491.250 15.0 22.5 No Increase 329725.866667 19.0 393318.666667 329725.866667 19.0 40.5 Above 15 Percent 269304.266667 20.0 324277.466667 269304.266667 20.0 77.0 Above 15 Percent
6 ADA ID NaN Star Boise City, ID 43.450841 -116.241035 2021 NaN NaN NaN 355784.733333 58.0 560965.533333 424153.000000 32.0 33.0 Above 15 Percent 357113.200000 58.0 563613.466667 426129.933333 32.0 36.0 Above 15 Percent 242491.25 57.0 381267.625 279064.500 37.0 26.0 Above 15 Percent 329725.866667 57.0 517221.000000 393318.666667 32.0 40.0 Above 15 Percent 269304.266667 59.0 428172.533333 324277.466667 32.0 54.0 Above 15 Percent
7 ADA ID 16001.0 Star Boise City, ID 43.450841 -116.241035 2022 15.6 3.0 12.3 355784.733333 53.0 545204.933333 560965.533333 -3.0 2123.0 4 Percent Decrease 357113.200000 53.0 547932.533333 563613.466667 -3.0 2122.0 4 Percent Decrease 242491.25 54.0 372473.125 381267.625 -2.0 709.5 2 Percent Decrease 329725.866667 53.0 503376.200000 517221.000000 -3.0 1954.0 4 Percent Decrease 269304.266667 55.0 417129.133333 428172.533333 -3.0 1682.5 4 Percent Decrease
8 ADA ID 16001.0 Star Boise City, ID 43.450841 -116.241035 2023 7.1 1.2 5.9 355784.733333 50.0 532802.866667 545204.933333 -2.0 1925.0 2 Percent Decrease 357113.200000 50.0 535748.600000 547932.533333 -2.0 1921.5 2 Percent Decrease 242491.25 42.0 343148.875 372473.125 -8.0 707.5 8 Percent Decrease 329725.866667 50.0 494369.933333 503376.200000 -2.0 1857.0 2 Percent Decrease 269304.266667 50.0 402681.333333 417129.133333 -3.0 1607.0 4 Percent Decrease
9 ADA ID 16001.0 Star Boise City, ID 43.450841 -116.241035 2024 5.9 1.1 4.8 355784.733333 54.0 549132.133333 532802.866667 3.0 1466.0 Above 3 Percent 357113.200000 55.0 552415.266667 535748.600000 3.0 1472.0 Above 3 Percent 242491.25 39.0 336176.500 343148.875 -2.0 599.0 2 Percent Decrease 329725.866667 54.0 509161.866667 494369.933333 3.0 1455.0 Above 3 Percent 269304.266667 52.0 409381.200000 402681.333333 2.0 1295.5 Above 1 Percent
10 ADAIR IA NaN Stuart Des Moines-West Des Moines, IA 41.330756 -94.471059 2020 NaN NaN NaN 146856.333333 10.0 160897.666667 146856.333333 10.0 1332.5 Above 10 Percent 143621.666667 10.0 157272.000000 143621.666667 10.0 1337.0 Above 10 Percent NaN NaN NaN NaN NaN NaN NaN 165108.000000 9.0 180129.500000 165108.000000 9.0 1432.5 Above 8 Percent 139233.000000 8.0 150896.000000 139233.000000 8.0 1352.0 Above 8 Percent
11 ADAIR IA NaN Stuart Des Moines-West Des Moines, IA 41.330756 -94.471059 2021 NaN NaN NaN 146856.333333 23.0 180827.000000 160897.666667 12.0 1321.5 Above 12 Percent 143621.666667 23.0 177125.333333 157272.000000 13.0 1170.5 Above 12 Percent NaN NaN NaN NaN NaN NaN NaN 165108.000000 26.0 207758.000000 180129.500000 15.0 981.5 No Increase 139233.000000 26.0 175802.000000 150896.000000 17.0 675.5 Above 15 Percent
12 ADAIR IA 19001.0 Stuart Des Moines-West Des Moines, IA 41.330756 -94.471059 2022 21.7 0.4 21.2 146856.333333 40.0 205169.333333 180827.000000 13.0 285.0 Above 12 Percent 143621.666667 40.0 200701.666667 177125.333333 13.0 287.5 Above 12 Percent NaN NaN NaN NaN NaN NaN NaN 165108.000000 40.0 231533.500000 207758.000000 11.0 832.5 Above 10 Percent 139233.000000 42.0 198035.000000 175802.000000 13.0 479.0 Above 12 Percent
13 ADAIR IA 19001.0 Stuart Des Moines-West Des Moines, IA 41.330756 -94.471059 2023 4.1 -0.9 5.0 146856.333333 38.0 202807.666667 205169.333333 -1.0 1828.0 2 Percent Decrease 143621.666667 38.0 198284.000000 200701.666667 -1.0 1828.5 2 Percent Decrease NaN NaN NaN NaN NaN NaN NaN 165108.000000 39.0 229555.000000 231533.500000 -1.0 1790.0 2 Percent Decrease 139233.000000 38.0 192032.000000 198035.000000 -3.0 1607.0 4 Percent Decrease
14 ADAIR IA 19001.0 Stuart Des Moines-West Des Moines, IA 41.330756 -94.471059 2024 0.1 -0.9 1.1 146856.333333 47.0 216255.000000 202807.666667 7.0 338.5 Above 5 Percent 143621.666667 47.0 211577.666667 198284.000000 7.0 344.0 Above 5 Percent NaN NaN NaN NaN NaN NaN NaN 165108.000000 49.0 245273.500000 229555.000000 7.0 305.5 Above 5 Percent 139233.000000 50.0 208822.000000 192032.000000 9.0 100.0 Above 8 Percent
In [102]:
# Sort the dataframe by 'State', 'County', and 'Year'
sorted_df = merged_df.sort_values(by=['State', 'County', 'Year_Recorded'])

# Write the sorted dataframe to a CSV file
sorted_df.to_csv(os.path.join(dirpath, 'zillow_combined_zhvi.csv'), index=False)
In [ ]:
 
In [7]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Filter Nebraska data and relevant years
nebraska_df = merged_df[(merged_df['State'] == 'NE') & (merged_df['Year_Recorded'].between(2019, 2024))]

# Melt the dataframe to long format
nebraska_melted_df = nebraska_df.melt(id_vars=['Year_Recorded', 'County'], 
                                      value_vars=['AnnualValue_AllHomes', 'AnnualValue_SingleFamily', 
                                                  'AnnualValue_Condo', 'AnnualValue_TwoBedRoom', 'AnnualValue_ThreeBedRoom'],
                                      var_name='HomeType', value_name='AnnualValue')

# Convert Year to numeric (just in case it's not)
nebraska_melted_df['Year_Recorded'] = nebraska_melted_df['Year_Recorded'].astype(int)

# Set Seaborn style
sns.set(style="whitegrid")

# Create FacetGrid for county-wise visualization
g = sns.FacetGrid(nebraska_melted_df, col="County", col_wrap=4, height=4, sharey=False)
g.map_dataframe(sns.lineplot, x="Year_Recorded", y="AnnualValue", hue="HomeType", marker="o")

# Adjust legends and labels
g.add_legend()
g.set_axis_labels("Year", "Annual Value ($)")
g.set_titles(col_template="{col_name} County")

# Rotate x-axis labels for better readability
plt.xticks(rotation=45)

plt.show()
No description has been provided for this image
In [8]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Filter Nebraska data and relevant years
california_df = merged_df[(merged_df['State'] == 'CA') & (merged_df['Year_Recorded'].between(2019, 2024))]

# Melt the dataframe to long format
california_melted_df = california_df.melt(id_vars=['Year_Recorded', 'County'], 
                                      value_vars=['AnnualValue_AllHomes', 'AnnualValue_SingleFamily', 
                                                  'AnnualValue_Condo', 'AnnualValue_TwoBedRoom', 'AnnualValue_ThreeBedRoom'],
                                      var_name='HomeType', value_name='AnnualValue')

# Convert Year to numeric (just in case it's not)
california_melted_df['Year_Recorded'] = california_melted_df['Year_Recorded'].astype(int)

# Set Seaborn style
sns.set(style="whitegrid")

# Create FacetGrid for county-wise visualization
g = sns.FacetGrid(california_melted_df, col="County", col_wrap=4, height=4, sharey=False)
g.map_dataframe(sns.lineplot, x="Year_Recorded", y="AnnualValue", hue="HomeType", marker="o")

# Adjust legends and labels
g.add_legend()
g.set_axis_labels("Year", "Annual Value ($)")
g.set_titles(col_template="{col_name} County")
plt.title("Annual Value Increases across California Counties")
# Rotate x-axis labels for better readability
plt.xticks(rotation=45)

plt.show()
No description has been provided for this image
In [9]:
plt.figure(figsize=(12, 6))
sns.kdeplot(data=nebraska_melted_df, x="AnnualValue", hue="County", fill=True, alpha=0.3)

plt.title("Kernel Density Estimate of Annual Home Values (Nebraska Counties)")
plt.xlabel("Annual Home Value ($)")
plt.ylabel("Density")
plt.legend(title="County")
plt.show()
No description has been provided for this image

This Stacked area Plot for Nebraska shows consistent upward trend in annual home values across all categories. Dominance of All Homes and Single Family Homes as they form base of stack and show most significant growth. Also shows significant contribution from Two and Three Bed rooms.Condo category shows least amount of growth compared to other categories.Slope of stacked area suggesting steady and sustatined growth rate in home values.

In [10]:
# Group data by Year and sum the values for each home type
area_data = nebraska_df.groupby('Year_Recorded')[['AnnualValue_AllHomes', 'AnnualValue_SingleFamily', 
                                                  'AnnualValue_Condo', 'AnnualValue_TwoBedRoom', 
                                                  'AnnualValue_ThreeBedRoom']].sum()

# Plot the stacked area plot
area_data.plot.area(stacked=True, figsize=(12, 6), colormap="coolwarm")

plt.title("Stacked Area Plot: Annual Home Value for Nebraska Increases (2019-2024)")
plt.xlabel("Year")
plt.ylabel("Total Annual Value ($)")
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [41]:
# Group data by Year and calculate the mean for each home type
area_data = nebraska_df.groupby('Year_Recorded')[['AnnualValue_AllHomes', 'AnnualValue_SingleFamily', 
                                                  'AnnualValue_Condo', 'AnnualValue_TwoBedRoom', 
                                                  'AnnualValue_ThreeBedRoom']].mean()

# Reset the index to prepare for melting
area_data.reset_index(inplace=True)

# Melt the dataframe to long format for easier plotting
melted_area_data = area_data.melt(id_vars='Year_Recorded', 
                                  var_name='HomeType', 
                                  value_name='AnnualValue')

melted_area_data['HomeType'] = melted_area_data['HomeType'].str.replace('AnnualValue_', '')

# Create a grouped bar plot with distinct colors
plt.figure(figsize=(12, 6))
sns.barplot(data=melted_area_data, x='Year_Recorded', y='AnnualValue', hue='HomeType', palette='tab10')

# Set custom y-axis ticks and labels
plt.yticks(ticks=[50000, 100000, 150000, 200000], labels=['1', '2', '3', '4'])

# Add titles and labels
plt.title("Mean Annual Home Value for Nebraska (2019-2024)")
plt.xlabel("Year")
plt.ylabel("Home Value (50k) interval")

# Adjust legend to the right side and lower
plt.legend(title="Home Type", bbox_to_anchor=(1.2, 0.5), loc='center right')
plt.xticks(rotation=45)

# Show the plot
plt.show()
No description has been provided for this image
In [42]:
area_data 
Out[42]:
Year_Recorded AnnualValue_AllHomes AnnualValue_SingleFamily AnnualValue_Condo AnnualValue_TwoBedRoom AnnualValue_ThreeBedRoom
0 2020 177474.575154 172517.263046 179463.958333 126276.855898 179030.370435
1 2021 193967.152737 188472.184144 201416.138889 140893.739311 198395.664257
2 2022 216364.318693 210144.002576 215266.583333 155791.905359 220881.031775
3 2023 222824.066206 216459.869567 226748.472222 162000.456708 228420.175757
4 2024 235226.804843 228646.313833 227612.680556 170177.036401 241818.987203
In [43]:
texas_df = merged_df[(merged_df['State'] == 'TX') & (merged_df['Year_Recorded'].between(2019, 2024))]
# Group data by Year and calculate the mean for each home type
area_data = texas_df.groupby('Year_Recorded')[['AnnualValue_AllHomes', 'AnnualValue_SingleFamily', 
                                                  'AnnualValue_Condo', 'AnnualValue_TwoBedRoom', 
                                                  'AnnualValue_ThreeBedRoom']].mean()

# Reset the index to prepare for melting
area_data.reset_index(inplace=True)

# Melt the dataframe to long format for easier plotting
melted_area_data = area_data.melt(id_vars='Year_Recorded', 
                                  var_name='HomeType', 
                                  value_name='AnnualValue')

melted_area_data['HomeType'] = melted_area_data['HomeType'].str.replace('AnnualValue_', '')

# Create a grouped bar plot with distinct colors
plt.figure(figsize=(12, 6))
sns.barplot(data=melted_area_data, x='Year_Recorded', y='AnnualValue', hue='HomeType', palette='tab10')

# Add titles and labels
plt.title("Mean Annual Home Value for Texas (2019-2024)")
plt.xlabel("Year")
plt.ylabel("Home Value (50k) interval")


# Adjust legend to the right side and lower
plt.legend(title="Home Type", bbox_to_anchor=(1.2, 0.5), loc='center right')
plt.xticks(rotation=45)

# Show the plot
plt.show()
No description has been provided for this image
In [44]:
area_data
Out[44]:
Year_Recorded AnnualValue_AllHomes AnnualValue_SingleFamily AnnualValue_Condo AnnualValue_TwoBedRoom AnnualValue_ThreeBedRoom
0 2020 202722.128510 202145.706569 174742.141054 145588.813376 201051.667907
1 2021 237267.928786 236574.945322 212633.963695 177102.407229 241266.520110
2 2022 249959.277645 249202.798058 235949.709449 196493.159187 266939.151848
3 2023 244409.487229 243612.356229 234366.815257 194003.554493 265470.577329
4 2024 246921.150411 246151.022535 227616.099913 192820.408494 268915.913155
In [38]:
texas_df.groupby('Year_Recorded')[['AnnualValue_Condo', 'AnnualValue_SingleFamily', 
                                               'AnnualValue_TwoBedRoom', 'AnnualValue_ThreeBedRoom']].mean()
Out[38]:
AnnualValue_Condo AnnualValue_SingleFamily AnnualValue_TwoBedRoom AnnualValue_ThreeBedRoom
Year_Recorded
2020 174742.141054 202145.706569 145588.813376 201051.667907
2021 212633.963695 236574.945322 177102.407229 241266.520110
2022 235949.709449 249202.798058 196493.159187 266939.151848
2023 234366.815257 243612.356229 194003.554493 265470.577329
2024 227616.099913 246151.022535 192820.408494 268915.913155
In [118]:
# Group data by Year and sum the values for each home type
texas_df = merged_df[(merged_df['State'] == 'TX') & (merged_df['Year_Recorded'].between(2019, 2024))]
area_data = texas_df.groupby('Year_Recorded')[['AnnualValue_AllHomes', 'AnnualValue_SingleFamily', 
                                                  'AnnualValue_Condo', 'AnnualValue_TwoBedRoom', 
                                                  'AnnualValue_ThreeBedRoom']].sum()

# Plot the stacked area plot
area_data.plot.area(stacked=True, figsize=(12, 6), colormap="coolwarm")

plt.title("Stacked Area Plot: Annual Home Value for Texas Increases (2019-2024)")
plt.xlabel("Year")
plt.ylabel("Total Annual Value ($)")
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [ ]:
 

Correlation Plot shows strong correlation between AllHomes , Single Family , Tow Bedroom and ThreeBedroom with Positive values and close to 1.This indicates when one of these annual categories increase the others tend to increase as well. This shows conistent overall trend ; when one segment is doing well others tend to follow.

Condos show weaker correlation with other categories across Nebraska. This suggest Condos are not much preferable.

In [122]:
import seaborn as sns
import matplotlib.pyplot as plt

# Create a new dataframe with just the columns needed for the correlation matrix
comparison_df = nebraska_df[['AnnualValue_AllHomes', 'AnnualValue_SingleFamily', 
                             'AnnualValue_Condo', 'AnnualValue_TwoBedRoom', 'AnnualValue_ThreeBedRoom']]

# Calculate the correlation matrix
corr_matrix = comparison_df.corr()

# Plot heatmap for the correlation matrix
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)

plt.title("Correlation Matrix for Nebraska Annual Value Increases")
plt.show()
No description has been provided for this image